library(knitr)
library(ggplot2)
library(plyr)
library(dplyr)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(Rmisc)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)
# Load the combined data set stored in all_DP_FE.rds (noted by the author as
# the output of feature_engineering.Rmd).
all <- readRDS(file = "all_DP_FE.rds")
# str(all)
Removal of highly multicollinear variables
# Drop one variable out of each highly correlated pair. Per the author's note,
# the variable to drop was chosen by its correlation with SalePrice.
rm_vars <- c(
  "YearRemodAdd", "GarageYrBlt", "GarageArea", "GarageCond",
  "TotalBsmtSF", "TotRmsAbvGrd", "BsmtFinSF1"
)
all <- all[, !is.element(names(all), rm_vars)]
# Print the dimensions after the removal.
dim(all)
[1] 2919 78
Separate the numeric from the encoded variables
# Build the list of numeric predictor names, then split `all` into a data
# frame of numeric predictors and a data frame of the remaining predictors.
#
# NOTE(review): `numericVarNames` is not created anywhere in this file;
# presumably it is left in the session by an earlier notebook -- confirm.
# Fail early with a clear message instead of an "object not found" error.
if (!exists("numericVarNames")) {
  stop(
    "`numericVarNames` is not defined; create it before running this chunk.",
    call. = FALSE
  )
}
# Remove the response and the variables that are not handled as numeric here.
numericVarNames <- numericVarNames[
  !(numericVarNames %in% c(
    "SalePrice", "MSSubClass", "MoSold",
    "OverallQual", "OverallCond", "YrSold"
  ))
]
# Add some of the engineered features to the list.
numericVarNames <- append(
  numericVarNames,
  c("Age", "Total_Bathrooms", "Total_Living_Area")
)
is_numeric_var <- names(all) %in% numericVarNames
# drop = FALSE keeps the result a data frame even if only one column matches.
Numeric_DF <- all[, is_numeric_var, drop = FALSE]
Factors_DF <- all[, !is_numeric_var, drop = FALSE]
# The response must not be part of the predictor data frame.
Factors_DF <- Factors_DF[, names(Factors_DF) != "SalePrice", drop = FALSE]
# Report the number of variables in each data frame.
sprintf(
  "There are %s numeric and %s factor variables",
  ncol(Numeric_DF), ncol(Factors_DF)
)
[1] "There are 28 numeric and 49 factor variables"
Fixing skewed predictor variables
# Log-transform every numeric predictor whose absolute skew exceeds 0.8.
skew_threshold <- 0.8
# seq_len() iterates zero times when Numeric_DF has no columns, whereas
# 1:ncol would produce c(1, 0) and index a column that does not exist.
for (i in seq_len(ncol(Numeric_DF))) {
  col_skew <- skew(Numeric_DF[, i])
  # isTRUE() skips a column whose skew comes back NA/NaN instead of aborting
  # the loop with "missing value where TRUE/FALSE needed".
  if (isTRUE(abs(col_skew) > skew_threshold)) {
    # Adding 1 before the log keeps zero values finite.
    Numeric_DF[, i] <- log(Numeric_DF[, i] + 1)
  }
}
Normalization
# Fit a centring-and-scaling transform on the numeric predictors, then apply
# it to the same data.
tmp <- preProcess(Numeric_DF, method = c("center", "scale"))
DF_norm <- predict(tmp, newdata = Numeric_DF)
# Display the normalised values.
DF_norm
Implementation of one hot encoding
# Show the factor data frame before encoding.
Factors_DF
# Expand Factors_DF into a design matrix for the one-hot encoding step; the
# "- 1" term in the formula removes the intercept column.
DF_tmp <- as.data.frame(model.matrix(~ . - 1, data = Factors_DF))
# Show the encoded result.
DF_tmp
Skewed response variable
# Display the response column.
all[, "SalePrice", drop = FALSE]
# Distribution and skew of SalePrice before the log transform.
hist(all$SalePrice)

skew(all[["SalePrice"]])
[1] 1.879009
# Replace SalePrice with its natural logarithm.
all[["SalePrice"]] <- log(all[["SalePrice"]])
# Re-check the distribution and skew after the transform.
hist(all$SalePrice)

skew(all[["SalePrice"]])
[1] 0.1210859
# Combine the normalised numeric predictors, the encoded predictors and the
# response into one data frame.
full <- cbind(DF_norm, DF_tmp, all["SalePrice"])
# Display the combined data.
full
# Rows with a SalePrice go to train; rows where SalePrice is NA go to test.
train <- full[!is.na(all[["SalePrice"]]), ]
test <- full[is.na(all[["SalePrice"]]), ]
dim(train)
[1] 1460 230
# Write the train and test sets to disk.
saveRDS(object = train, file = "train.rds")
saveRDS(object = test, file = "test.rds")
LS0tDQp0aXRsZTogIlByZS1Qcm9jZXNzaW5nIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KYGBge3J9DQpsaWJyYXJ5KGtuaXRyKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeShwbHlyKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoY29ycnBsb3QpDQpsaWJyYXJ5KGNhcmV0KQ0KbGlicmFyeShncmlkRXh0cmEpDQpsaWJyYXJ5KHNjYWxlcykNCmxpYnJhcnkoUm1pc2MpDQpsaWJyYXJ5KGdncmVwZWwpDQpsaWJyYXJ5KHJhbmRvbUZvcmVzdCkNCmxpYnJhcnkocHN5Y2gpDQpsaWJyYXJ5KHhnYm9vc3QpDQpgYGANCg0KYGBge3J9DQojcmVhZCBpbiBhbGwucmRzKG91dHB1dCBmcm9tIGZlYXR1cmVfZW5naW5lZXJpbmcuUm1kKQ0KYWxsPXJlYWRSRFMoImFsbF9EUF9GRS5yZHMiKQ0KI3N0cihhbGwpDQpgYGANCg0KDQojIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyBSZW1vdmFsIGhpZ2ggbXVsaXRjb2xsaW5lYXIgdmFyaWFibGVzICMjIyMjIyMjIyMjIyMjIyMjIyMNCg0KDQpgYGB7cn0NCiNyZW1vdmluZyBvbmUgb2YgdGhlIHR3byB2YXJpYWJsZXMgdGhhdCBoYXMgaGlnaCBjb3JyZWxhdGlvbiB3aXRoIGFub3RoZXIgdmFyaWFibGUuIFRoZSB2YXJpYWJsZSB0byBkcm9wIGlzIGJhc2VkIG9uIGl0cyBjb3JyZWxhdGlvbiB3aXRoIHRoZSBTYWxlUHJpY2UuDQpybV92YXJzPWMoIlllYXJSZW1vZEFkZCIsIkdhcmFnZVlyQmx0IiwiR2FyYWdlQXJlYSIsIkdhcmFnZUNvbmQiLCJUb3RhbEJzbXRTRiIsICJUb3RSbXNBYnZHcmQiLCJCc210RmluU0YxIikNCmFsbD1hbGxbLCEobmFtZXMoYWxsKSAlaW4lIHJtX3ZhcnMpXQ0KDQojY2hlY2sgZGltZW5zaW9uDQpkaW0oYWxsKQ0KYGBgDQoNCiMjIyMjIyMjIyMjIyMjIyMjIyBTZXBhcmF0ZSB0aGUgbnVtZXJpYyBmcm9tIHRoZSBlbmNvZGVkIHZhcmlhYmxlcyAjIyMjIyMjIyMjIyMjDQoNCmBgYHtyfQ0KDQojc2V0dGluZyB0aGUgbmFtZSBsaXN0IGZvciBudW1lcmljIHZhcmlhYmxlcw0KbnVtZXJpY1Zhck5hbWVzPW51bWVyaWNWYXJOYW1lc1shKG51bWVyaWNWYXJOYW1lcyAlaW4lIGMoJ1NhbGVQcmljZScsJ01TU3ViQ2xhc3MnLCdNb1NvbGQnLCdPdmVyYWxsUXVhbCcsJ092ZXJhbGxDb25kJywnWXJTb2xkJykpXSANCiNpbmNsdWRlIHNvbWUgb2YgdGhlIGZlYXR1cmVzIGVuZ2luZWVyZWQgaW50byB0aGUgbGlzdA0KbnVtZXJpY1Zhck5hbWVzPWFwcGVuZChudW1lcmljVmFyTmFtZXMsYygnQWdlJywnVG90YWxfQmF0aHJvb21zJywnVG90YWxfTGl2aW5nX0FyZWEnKSkNCg0KI2NyZWF0aW5nIHRoZSBudW1lcmljIGRhdGFmcmFtZQ0KTnVtZXJpY19ERj1hbGxbLG5hbWVzKGFsbCkgJWluJSBudW1lcmljVmFyTmFtZXNdDQojY3JlYXRpbmcgdGhlIGZhY3RvciBkYXRhZnJhbWUNCkZhY3RvcnNfREY9YWxsWywhKG5hbWVzKGFsbCkgJWluJSBudW1lcmljVmFyTmFtZXMpXQ0KRmFjdG9yc19ERj1GYWN0b3JzX0RGWyxuYW1lcyhGYWN0b3JzX0RGKSE9J1NhbGVQcmljZSddDQoNCiNjaGVj
ayB0aGUgbnVtYmVyIG9mIHZhcmlhYmxlcyBpbiBlYWNoIGRhdGFmcmFtZQ0Kc3ByaW50ZigiVGhlcmUgYXJlICVzIG51bWVyaWMgYW5kICVzIGZhY3RvciB2YXJpYWJsZXMiLCBkaW0oTnVtZXJpY19ERilbMl0sZGltKEZhY3RvcnNfREYpWzJdKQ0KYGBgDQoNCiMjIyMjIyMjIyMjIyMjIyMjIyBGaXhpbmcgc2tld2VkIHByZWRpY3RhdG9yIHZhcmlhYmxlcyAjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIw0KDQpgYGB7cn0NCiNhZGp1c3Qgc2tld2VkIHByZWRpY3RhdG9yIHZhcmlhYmxlcyA+IDAuOA0KZm9yKGkgaW4gMTooZGltKE51bWVyaWNfREYpWzJdKSl7DQogICAgaWYoYWJzKHNrZXcoTnVtZXJpY19ERlssaV0pKT4wLjgpew0KICAgICAgICAgIE51bWVyaWNfREZbLGldPWxvZyhOdW1lcmljX0RGWyxpXSsxKQ0KICAgICAgICB9DQp9DQpgYGANCg0KIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyBOb3JtYWxpemF0aW9uICMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIw0KDQpgYGB7cn0NCiNub3JtYWxpemluZyBvZiB0aGUgZGF0YQ0KdG1wPXByZVByb2Nlc3MoTnVtZXJpY19ERixtZXRob2Q9YygiY2VudGVyIiwgInNjYWxlIikpDQpERl9ub3JtPXByZWRpY3QodG1wLE51bWVyaWNfREYpDQoNCiNjaGVjayBub3JtYWxpemVkIHZhbHVlcw0KREZfbm9ybQ0KYGBgDQoNCiMjIyMjIyMjIyMjIyMjIyMgSW1wbGVtZW50YXRpb24gb2Ygb25lIGhvdCBlbmNvZGluZyAjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjDQoNCmBgYHtyfQ0KI2JlZm9yZSBvbmUtaG90IGltcGxlbWVudGF0aW9uDQpGYWN0b3JzX0RGDQoNCiNlbmNvZGluIEZhY3RvcnNfREYNCkRGX3RtcD1hcy5kYXRhLmZyYW1lKG1vZGVsLm1hdHJpeCh+Li0xLEZhY3RvcnNfREYpKQ0KDQojY2hlY2sgdGhlIG9uZS1ob3QgZW5jb2RlZCBvdXRwdXQNCkRGX3RtcA0KYGBgDQoNCiMjIyMjIyMjIyMjIyMjIyMjIFNrZXdlZCByZXNwb25zZSB2YXJpYWJsZSAjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIw0KDQpgYGB7cn0NCmFsbFsiU2FsZVByaWNlIl0NCg0KI2JlZm9yZSBpbXBsZW1lbnRhdGlvbiBvZiBsb2cgZnVuY3Rpb24NCmhpc3QoYWxsJFNhbGVQcmljZSkNCnNrZXcoYWxsJFNhbGVQcmljZSkNCg0KI2ltcGxlbWVudGF0aW9uIG9mIGxvZyBmdW5jdGlvbg0KYWxsJFNhbGVQcmljZT1sb2coYWxsJFNhbGVQcmljZSkgDQojY2hlY2sgdGhlIGltcGxlbWVudGF0aW9uDQpoaXN0KGFsbCRTYWxlUHJpY2UpDQpza2V3KGFsbCRTYWxlUHJpY2UpDQpgYGANCg0KYGBge3J9DQpmdWxsPWNiaW5kKERGX25vcm0sIERGX3RtcCxhbGxbIlNhbGVQcmljZSJdKQ0KI2NoZWNrIGZ1bGwgZGF0YQ0KZnVsbA0KDQp0cmFpbj1mdWxsWyFpcy5uYShhbGwkU2FsZVByaWNlKSxdDQp0ZXN0PWZ1bGxbaXMubmEoYWxsJFNhbGVQcmljZSksXQ0KZGltKHRyYWluKQ0KDQojc3BsaXQgYmFjayB0byB0cmFpbiBhbmQgdGVzdCBzZXQNCnNhdmVSRFModHJhaW4sZmlsZT0idHJhaW4ucmRzIikN
CnNhdmVSRFModGVzdCxmaWxlPSJ0ZXN0LnJkcyIpDQpgYGANCg0KDQoNCg0K